Ferritin ID Thresholds

FIGUREPATH <- "~/who_ferritin_comment/results/figures/"
BOOTPATH <- "~/who_ferritin_comment/results/boot/"
DATAPATH <- "~/who_ferritin_comment/data/"
MODELPATH <- "~/who_ferritin_comment/results/models/"
knots <- 3:25
response <- "Hemoglobin"
zoom <- T
boot_n <- 1000
Libraries
source("~/who_ferritin_comment/src/functions.R")
library(dplyr)
library(rms)
library(mcp)
library(boot)
library(scam)

Load data

# Load data
# Naming assumptions:
# Ferritin, Hemoglobin, sTfR
data <- readRDS(paste0(DATAPATH, "h2000_menstr_women_apparently_healthy.rds"))

Test RCS model selection

(If plots are too small due to layout settings, you may change the chunk option layout-ncol: 4 to 3, 2, or 1. You may also view these plots by accessing your FIGUREPATH, or by right-clicking and selecting Open image in new tab.)

Visualize RCS knots
data_full <- data
data <- data %>% select(Ferritin, as.name(response))
# set datadist
dd <- datadist(data)
options(datadist = "dd")

investigate_RCS(data, response, knots, zoom, FIGUREPATH)

Bootstrap knot selection

Select optimum number of knots via bootstrapping and AIC
# The function boot_knot_selection is defined in functions.R
if (!file.exists(paste0(BOOTPATH, response, "_rcs_knots_", boot_n, "_bootobj.rds"))) {
    boot_knots <- boot(data = data, statistic = boot_knot_selection, R = boot_n, response = response, knots = knots)
    boot_knot_ci <- boot.ci(boot_knots, type = "perc") # BCa not computable on discrete data with this tight interval
    saveRDS(boot_knots, paste0(BOOTPATH, response, "_rcs_knots_", boot_n, "_bootobj.rds"))
    saveRDS(boot_knot_ci, paste0(BOOTPATH, response, "_rcs_knot_ci_", boot_n, "_boot_ci.rds"))
} else {
    boot_knots <- readRDS(paste0(BOOTPATH, response, "_rcs_knots_", boot_n, "_bootobj.rds"))
    boot_knot_ci <- readRDS(paste0(BOOTPATH, response, "_rcs_knot_ci_", boot_n, "_boot_ci.rds"))
    }

boot_df <- data.frame(t = boot_knots$t) %>% 
    mutate(within_ci = (t >= boot_knot_ci$percent[4] & t <= boot_knot_ci$percent[5]))

bootknot_ci_p <- ggplot(data = boot_df, aes(x = t, y = ..count../sum(..count..), fill = within_ci)) + 
    geom_bar() +
    theme_minimal() +
    labs(x = "Selected number of knots",
         y = "Frequency",
         fill = "Within CI")

ggsave(paste0(FIGUREPATH, response, "_rcs_boot_knot_distr.pdf"), bootknot_ci_p, width = 8, height = 6, device = cairo_pdf)

bootknot_ci_p

Select optimum number of knots via bootstrapping and AIC
selected_knot_n <- boot_knots$t0

# Final RCS fit
rcs_fit <- ols( as.formula(sprintf("%s ~ rcs(Ferritin, %s)", response, selected_knot_n)), data = data, x = T, y = T)
saveRDS(rcs_fit, paste0(MODELPATH, "rcs_final.rds"))

Bootstrap threshold

Now use selected knot number to bootstrap saddle estimates

Find estimates for saddle point via bootstrapping
# The function boot_saddle  is defined in functions.R
if (!file.exists(paste0(BOOTPATH, response, "_rcs_saddle_", boot_n, "_bootobj.rds"))) {
    boot_saddle <- boot(data = data, statistic = boot_saddle_estimate, R = boot_n, response = response, knot_n = selected_knot_n)
    invalid_indices <- which(boot_saddle$t == 0)
    boot_saddle_ci <- boot.ci(boot_saddle, type = "perc", exclude = invalid_indices)
    saveRDS(boot_saddle, paste0(BOOTPATH, response, "_rcs_saddle_", boot_n, "_bootobj.rds"))
    saveRDS(boot_saddle_ci, paste0(BOOTPATH, response, "_rcs_saddle_ci_", boot_n, "_boot_ci.rds"))
} else {
    boot_saddle <- readRDS(paste0(BOOTPATH, response, "_rcs_saddle_", boot_n, "_bootobj.rds"))
    boot_saddle_ci <- readRDS(paste0(BOOTPATH, response, "_rcs_saddle_ci_", boot_n, "_boot_ci.rds"))
    invalid_indices <- which(boot_saddle$t == 0)
    }

paste0("Bootstrapped saddle point estimate is computable in ", round((1 - length(invalid_indices)/length(boot_saddle$t)) * 100, 2), "% of iterations.")
[1] "Bootstrapped saddle point estimate is computable in 100% of iterations."
Find estimates for saddle point via bootstrapping
rcs_final_p <- plot_rcs(data, rcs_fit, n_knot = selected_knot_n, zoom = T, FIGUREPATH = NULL) +
    annotate('rect', xmin = boot_saddle_ci$percent[4], xmax = boot_saddle_ci$percent[5], ymin = -Inf, ymax = Inf, alpha = 0.2, fill = 'black') +
    labs(subtitle = paste0("Saddle point: ", round(boot_saddle_ci$t0, 2), " μg/L [", round(boot_saddle_ci$percent[4], 2), ", ", round(boot_saddle_ci$percent[5], 2), "]"))

ggsave(paste0(FIGUREPATH, response, "_rcs_final.pdf"), rcs_final_p, width = 8, height = 6, device = cairo_pdf)

rcs_final_p

Find estimates for saddle point via bootstrapping
rcs_final_saddle_density_p <-
    ggplot(data = data.frame(t = boot_saddle$t), aes(x = t)) +
    geom_density() +
    theme_minimal() +
    labs(x = "Saddle point",
         y = "Density")

ggsave(paste0(FIGUREPATH, response, "_rcs_final_saddle_density.pdf"), rcs_final_saddle_density_p, width = 8, height = 6, device = cairo_pdf)

rcs_final_saddle_density_p

Bayesian breakpoint analysis

We limit our data to here to Ferritin <= 100, otherwise the 2 breakpoint version will place the second breakpoint way beyond it. This decision is a source of bias.

1 breakpoint

Bayesian breakpoint analysis, 1 breakpoint
concat_data <- data %>% 
    filter(Ferritin <= 100)

if (!file.exists(paste0(MODELPATH, response, "_bayesian_1bp.rds"))) {
    # Specify model
    if (response == "Hemoglobin") {
        baye_1bp_spec <- list(Hemoglobin ~ Ferritin,
                              ~ 0 + Ferritin)
    }
    if (response == "sTfR") {
        baye_1bp_spec <- list(sTfR ~ Ferritin,
                              ~ 0 + Ferritin)
    }
    
    # Fit
    baye_1bp_fit <- mcp(baye_1bp_spec, 
                   data = concat_data,
                   adapt = 500,
                   iter = 500,
                   chains = 4,
                   cores = 4)
    
    saveRDS(baye_1bp_fit, paste0(MODELPATH, response, "_bayesian_1bp.rds"))
} else {
    baye_1bp_fit <- readRDS(paste0(MODELPATH, response, "_bayesian_1bp.rds"))
    }

summary(baye_1bp_fit)
Family: gaussian(link = 'identity')
Iterations: 2000 from 4 chains.
Segments:
  1: Hemoglobin ~ Ferritin
  2: Hemoglobin ~ 1 ~ 0 + Ferritin

Population-level parameters:
       name   mean  lower  upper Rhat n.eff
 Ferritin_1  5.973  4.870  7.254  2.1    14
 Ferritin_2  0.054  0.018  0.085  1.0   333
       cp_1  7.184  6.752  7.722  1.3    55
      int_1 91.034 84.213 97.079  2.2    19
    sigma_1 10.508 10.086 10.949  1.0  2109
Bayesian breakpoint analysis, 1 breakpoint
# Plot
bay1_p <- plot(baye_1bp_fit) +
    theme_minimal() +
    coord_cartesian(xlim = c(0, 100))

posterior_check_p <- pp_check(baye_1bp_fit)

convergence_check_p <- plot_pars(baye_1bp_fit, regex_pars = "cp_")

ggsave(paste0(FIGUREPATH, response, "_bay1bp.pdf"), bay1_p, width = 8, height = 6, device = cairo_pdf)
ggsave(paste0(FIGUREPATH, response, "_bay1bp_ppcheck.pdf"), posterior_check_p, width = 8, height = 6, device = cairo_pdf)
ggsave(paste0(FIGUREPATH, response, "_bay1bp_convcheck.pdf"), convergence_check_p, width = 8, height = 6, device = cairo_pdf)

bay1_p

Bayesian breakpoint analysis, 1 breakpoint
posterior_check_p

Bayesian breakpoint analysis, 1 breakpoint
convergence_check_p

2 breakpoints

Bayesian breakpoint analysis, 2 breakpoints
if (!file.exists(paste0(MODELPATH, response, "_bayesian_2bp.rds"))) {
    # Specify model
    if (response == "Hemoglobin") {
        baye_2bp_spec <- list(Hemoglobin ~ Ferritin,
                              ~ 0 + Ferritin,
                              ~ 0 + Ferritin)
    }
    if (response == "sTfR") {
        baye_2bp_spec <- list(sTfR ~ Ferritin,
                              ~ 0 + Ferritin,
                              ~ 0 + Ferritin)
    }
    
    # Fit
    baye_2bp_fit <- mcp(baye_2bp_spec, 
                   data = concat_data,
                   adapt = 500,
                   iter = 500,
                   chains = 4,
                   cores = 4)
    
    saveRDS(baye_2bp_fit, paste0(MODELPATH, response, "_bayesian_2bp.rds"))
} else {
    baye_2bp_fit <- readRDS(paste0(MODELPATH, response, "_bayesian_2bp.rds"))
}

summary(baye_2bp_fit)
Family: gaussian(link = 'identity')
Iterations: 2000 from 4 chains.
Segments:
  1: Hemoglobin ~ Ferritin
  2: Hemoglobin ~ 1 ~ 0 + Ferritin
  3: Hemoglobin ~ 1 ~ 0 + Ferritin

Population-level parameters:
       name    mean  lower  upper Rhat n.eff
 Ferritin_1  9.2439  4.766 14.575  8.4    11
 Ferritin_2  0.3861  0.054  0.759  2.1    46
 Ferritin_3  0.0039 -0.062  0.067  1.3   245
       cp_1  5.9975  4.826  7.282  7.7    49
       cp_2 23.7243 12.319 44.856  1.4    55
      int_1 77.9793 55.778 97.703  7.4    15
    sigma_1 10.4540 10.000 10.876  1.0  1315
Bayesian breakpoint analysis, 2 breakpoints
# Plot
bay2_p <- plot(baye_2bp_fit) +
    theme_minimal() +
    coord_cartesian(xlim = c(0, 100))

posterior_check_p <- pp_check(baye_2bp_fit)

convergence_check_p <- plot_pars(baye_2bp_fit, regex_pars = "cp_")

ggsave(paste0(FIGUREPATH, response, "_bay2bp.pdf"), bay2_p, width = 8, height = 6, device = cairo_pdf)
ggsave(paste0(FIGUREPATH, response, "_bay2bp.pdf"), posterior_check_p, width = 8, height = 6, device = cairo_pdf)
ggsave(paste0(FIGUREPATH, response, "_bay2bp.pdf"), convergence_check_p, width = 8, height = 6, device = cairo_pdf)

bay2_p

Bayesian breakpoint analysis, 2 breakpoints
posterior_check_p

Bayesian breakpoint analysis, 2 breakpoints
convergence_check_p

Play with scam

scamfit <- scam(Hemoglobin ~ s(Ferritin, k = 8, bs = "mpi"), data = concat_data)
plot(scamfit, residuals = T)

From the supplement of Galetti et al. (2021):

To identify thresholds, we then used the method of finite differences to estimate the first derivatives of the fitted trend spline from the generalized additive modelling smoother and we identified the periods in which the estimated rate of change between fitted y and x was statistically significant (i.e. statistically different from zero) by computing the uncertainty around the estimates from 10 000 posterior simulations. At the point on the x-axis where 95% of the posterior simulations are all different from zero for the first derivative, we could identify where the slope between fitted y and x began to be signiicantly positive (or negative).

The issue currently is that we are deliberately choosing to shape restrict the fit to a monotonically increasing curve \(\rightarrow\) the derivative can never cross zero, so we’d have to choose an arbitrary number “close to zero” instead, which will be difficult to justify.

d1 <- derivative.scam(scamfit)
xx <- sort(concat_data$Ferritin, index = TRUE)
plot(xx$x, d1$d[xx$ix], type = "l")